% arXiv preprint. The arXiv identifier is stored in the eprint/archiveprefix
% fields so eprint-aware styles can format and link it; `journal` keeps only a
% generic label so styles that expect a journal on @article still get one.
@article{min2021cross,
  author        = {Min, Shaobo and Dai, Qi and Xie, Hongtao and Gan, Chuang and Zhang, Yongdong and Wang, Jingdong},
  title         = {Cross-Modal Attention Consistency for Video-Audio Unsupervised Learning},
  journal       = {arXiv preprint},
  year          = {2021},
  eprint        = {2106.06939},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2106.06939},
}